package docsim;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Static helpers for pre-processing the text of a document: markup braces,
 * punctuation and numbers are replaced using the regular expressions declared
 * in {@code CONSTANTS}, after which the text is stripped of newlines, trimmed
 * and lower-cased.
 *
 * @author oraut
 */
public class Preprocessor
{
    private static final String EMPTY = "";
    private static final String NEWLINE = "\n";

    // The regular expressions are compiled once instead of on every call;
    // String.replaceAll(regex, repl) would recompile the pattern each time.
    private static final Pattern PUNCTUATION = Pattern.compile(CONSTANTS.PUNCTUATION_REGEX);
    private static final Pattern BRACES = Pattern.compile(CONSTANTS.REGEX_BRACES);
    private static final Pattern NUMBERS = Pattern.compile(CONSTANTS.REGEX_NUMBERS);

    /**
     * Replaces every match of {@code CONSTANTS.PUNCTUATION_REGEX} in the text
     * with {@code CONSTANTS.CHAR_SPACE}.
     *
     * @param text the text to clean; must not be {@code null}
     * @return the text with every punctuation match replaced
     */
    public static String removePunctuation (String text) // tested
    {
        return PUNCTUATION.matcher(text).replaceAll(CONSTANTS.CHAR_SPACE);
    }

    /**
     * Replaces every match of {@code CONSTANTS.REGEX_BRACES} in the text with
     * {@code CONSTANTS.CHAR_SPACE}. Intended for HTML or XML input. MUST be
     * called before {@link #removePunctuation(String)}!
     *
     * @param text the text to clean; must not be {@code null}
     * @return the text with every brace match replaced
     */
    public static String removeBraces (String text) // tested
    {
        return BRACES.matcher(text).replaceAll(CONSTANTS.CHAR_SPACE);
    }

    /**
     * Replaces every match of {@code CONSTANTS.REGEX_NUMBERS} in the text with
     * {@code CONSTANTS.CHAR_SPACE}.
     *
     * @param text the text to clean; must not be {@code null}
     * @return the text with every number match replaced
     */
    public static String removeNumbers (String text) // tested
    {
        return NUMBERS.matcher(text).replaceAll(CONSTANTS.CHAR_SPACE);
    }

    /**
     * Runs the complete pre-processing pipeline: braces, then punctuation,
     * then numbers are replaced; afterwards newline characters are deleted,
     * surrounding whitespace is trimmed and the result is lower-cased.
     *
     * @param text the raw text; must not be {@code null}
     * @return the normalized, lower-case text
     */
    public static String preprocessText (String text)
    {
        String cleaned = removeNumbers(removePunctuation(removeBraces(text)));

        // NOTE(review): newlines are deleted rather than replaced by a
        // separator, so words on adjacent lines are concatenated unless one of
        // the earlier replacements already put whitespace there - confirm
        // that this is intended.
        String singleLine = cleaned.replace(NEWLINE, EMPTY).trim();

        // Locale.ROOT keeps the result independent of the JVM default locale
        // (e.g. under a Turkish locale "I" would lower-case to a dotless i).
        return singleLine.toLowerCase(Locale.ROOT);
    }

    /**
     * Manual smoke test: pre-processes a sample XML fragment, prints the
     * result, prints the elapsed wall-clock time of that first run in
     * milliseconds, then prints the result a second time.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        String s = "nspecific with encephalomyopathy, failure to thrive, seizures, ophthalmoplegia, and sensorineural hearing loss.<xref ref-type=\"bibr\" rid=\"B1\">1</xref>,<xref ref-type=\"bibr\" rid=\"B2\">2</xref> Impaired energy production results from overall dysfunction of the mitochondrial respiratory chain, which is composed of five enzymatic complexes embedded in the inner mitochondrial membrane.</p><p>There is no specific treatment for ME, and only conservative care is available. One treatment is ketogenic diet therapy with a mitochondrial disease treatment cocktail of coenzyme Q10, riboflavin, L-carnitine, and high-dose multivitamins; some favorable results have been reported.<xref ref-type=7869198</pub-id></element-citation></ref></ref-list></back><floats-group><fig id=\"F1\" position=\"float\"><label>Fig. 1</label><caption><p>Two-year-old male with Leigh disease (case 2). Pre-treatment MRS shows high lactate peak (arrow) at basal ganglia. One year follow up MRS shows decrease of lactate (double arrows) and restoration of NAA. NAA, N-acetylaspartate.</p></caption><graphic xlink:href=\"ymj-51-672-g001\"/></fig><table-wrap id=\"T1\" position=\"float\"><label>Table 1</label><caption><p>Patient Characteristics According to Mitochondrial Enzyme Defect, MR Findings and MRS Findings</p></caption><graphic xlink:href=\"ymj-51-672-i001\"/><table-wrap-foot><fn><p>BG, basal ganglia; NAA, N-acetylaspartate; Cho, choline; Cr, creatinine.</p><p>Lactate doublet: doublet peak at 1.2 - 1.4 ppm, (++) marked increase, (+) positive, (-) negative.</p><p><sup>*</sup>Significant reduction between pre-treatment Cho/Cr and post-treatment Cho/Cr (<italic>p</italic> = 0.0058, paired t-test, two-tailed).</p></fn></table-wrap-foot></table-wrap></floats-group></article>o";
        long t1 = System.currentTimeMillis();
        System.out.println(preprocessText(s));
        long t2 = System.currentTimeMillis();
        System.out.println(t2-t1);
        System.out.println(preprocessText(s));
    }
}
